knitr::opts_chunk$set(warning = FALSE, message = FALSE)
library(tidyverse)
library(lsa)
library(readxl)
library(plotly)
library(countrycode)
df <- read_delim('../../data/country_portfolios_dimensions.csv',delim = ';') %>% 
  rename(N=p) %>% 
  filter(type !='all') %>% 
  mutate(type = case_match(type,
                           'national' ~ 'National',
                           'international' ~ 'International',
                           'migrant' ~ 'Mobile'),
         country_code =countrycode(country_code, origin = 'iso2c', destination = 'country.name')
)
countries_keep <- df %>% group_by(country_code) %>% 
  summarise(N=sum(N)) %>% 
  filter(N>1000) %>% 
  pull(country_code)

df <- df %>% 
  filter(country_code %in% countries_keep)
df_clean <- df %>% 
  group_by(type,for_group_id,country_code) %>% 
  summarise(N = sum(N)) %>% 
  ungroup() %>% 
  mutate(for_group_id = factor(for_group_id),
         country_code = factor(country_code)) %>% 
     complete(type,for_group_id,country_code,fill=list(N=0))

#filter empty topics

empty_topics <- df_clean %>% 
  group_by(for_group_id,country_code) %>% 
  summarise(N= sum(N)) %>% 
  filter(N==0)

df_clean2 <- df_clean %>% 
  anti_join(empty_topics)
df_clean <- df_clean %>% 
  group_by(type,country_code) %>% 
  reframe(for_group_id,
          N,
          p = N/sum(N))

df_clean2 <- df_clean2 %>% 
  group_by(type,country_code) %>% 
  reframe(for_group_id,
          N,
          p = N/sum(N))
cossims <- function(df){
  df %>% 
  select(-country_code,-N) %>% 
  pivot_wider(names_from = type,values_from = p) %>% 
  summarise(cos_IM = cosine(International,Mobile),
            cos_IN = cosine(International,National),
            cos_MN = cosine(Mobile,National))
}
cosine_sim <- df_clean %>% 
  group_by(country_code) %>% 
  cossims()

size versus change

ggplotly(
df %>% group_by(country_code) %>% 
  summarise(N=sum(N)) %>% 
  right_join(cosine_sim) %>% 
  pivot_longer(cols = cos_IM:cos_MN,names_to = 'relation',values_to = 'cosine',names_prefix = 'cos_') %>% 
  mutate(relation = case_when(relation=='IM' ~'International-Mobile',
                              relation=='IN' ~'International-National',
                              relation=='MN' ~'Mobile-National',
                              )) %>% 
  filter(country_code!='ZZALL') %>% 
  ggplot(aes(N, cosine, color=relation, label=country_code)) +
  geom_point()+
  
  scale_x_log10()
)

Try to normalize size

compare all papers vs all papers minus (international/Mobile/National)

cossims_diff <- function(df){
  df %>%
  select(-country_code,-p) %>% 
  pivot_wider(id_cols = c(country_code,for_group_id),names_from = type,values_from = N) %>% 
  mutate(All = International + Mobile + National,
         International = All - International,
         Mobile = All - Mobile,
         National = All - National) %>% 
  mutate(International = International/sum(International),
         Mobile = Mobile/sum(Mobile),
         National = National/sum(National)) %>% 
  summarise(International = cosine(International,All),
            National = cosine(National,All),
            Mobile = cosine(Mobile,All))
}
cosine_sim_diff <- df_clean %>% 
  group_by(country_code) %>% 
  cossims_diff()
ggplotly(
df %>% group_by(country_code) %>% 
  summarise(N=sum(N)) %>% 
  right_join(cosine_sim_diff) %>% 
  pivot_longer(cols = International:Mobile,names_to = 'relation',values_to = 'cosine') %>% 
  filter(country_code!='ZZALL') %>% 
  ggplot(aes(N, cosine, color=relation, label=country_code)) +
  geom_point()+
  
  scale_x_log10()
)

group sizes

ggplotly(
df %>% group_by(country_code, type) %>% 
  summarise(N=sum(N)) %>% 
  right_join(cosine_sim_diff %>% pivot_longer(International:Mobile, names_to='type', values_to='cosine')) %>% 
  # pivot_longer(cols = International:Mobile,names_to = 'relation',values_to = 'cosine') %>% 
  filter(country_code!='ZZALL') %>% 
  group_by(country_code) %>% 
  mutate(country_N = sum(N)) %>% 
  ggplot(aes(country_N, cosine, color=type,size=N, label=country_code)) +
  geom_point()+
  # 
  scale_x_log10()
)
NA

excluding topics that have all 0’s


cosine_sim_diff2 <- df_clean2 %>% 
  group_by(country_code) %>% 
  cossims_diff()

ggplotly(
df %>% group_by(country_code, type) %>% 
  summarise(N=sum(N)) %>% 
  right_join(cosine_sim_diff2 %>% pivot_longer(International:Mobile, names_to='type', values_to='cosine')) %>% 
  # pivot_longer(cols = International:Mobile,names_to = 'relation',values_to = 'cosine') %>% 
  filter(country_code!='ZZALL') %>% 
  group_by(country_code) %>% 
  mutate(country_N = sum(N)) %>% 
  ggplot(aes(country_N, cosine, color=type,size=N, label=country_code)) +
  geom_point()+
  
  scale_x_log10()
)
NA
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyIHNldHVwfQprbml0cjo6b3B0c19jaHVuayRzZXQod2FybmluZyA9IEZBTFNFLCBtZXNzYWdlID0gRkFMU0UpCmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGxzYSkKbGlicmFyeShyZWFkeGwpCmxpYnJhcnkocGxvdGx5KQpsaWJyYXJ5KGNvdW50cnljb2RlKQpgYGAKCgpgYGB7cn0KZGYgPC0gcmVhZF9kZWxpbSgnLi4vLi4vZGF0YS9jb3VudHJ5X3BvcnRmb2xpb3NfZGltZW5zaW9ucy5jc3YnLGRlbGltID0gJzsnKSAlPiUgCiAgcmVuYW1lKE49cCkgJT4lIAogIGZpbHRlcih0eXBlICE9J2FsbCcpICU+JSAKICBtdXRhdGUodHlwZSA9IGNhc2VfbWF0Y2godHlwZSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgJ25hdGlvbmFsJyB+ICdOYXRpb25hbCcsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICdpbnRlcm5hdGlvbmFsJyB+ICdJbnRlcm5hdGlvbmFsJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgJ21pZ3JhbnQnIH4gJ01vYmlsZScpLAogICAgICAgICBjb3VudHJ5X2NvZGUgPWNvdW50cnljb2RlKGNvdW50cnlfY29kZSwgb3JpZ2luID0gJ2lzbzJjJywgZGVzdGluYXRpb24gPSAnY291bnRyeS5uYW1lJykKKQoKYGBgCgpgYGB7cn0KY291bnRyaWVzX2tlZXAgPC0gZGYgJT4lIGdyb3VwX2J5KGNvdW50cnlfY29kZSkgJT4lIAogIHN1bW1hcmlzZShOPXN1bShOKSkgJT4lIAogIGZpbHRlcihOPjEwMDApICU+JSAKICBwdWxsKGNvdW50cnlfY29kZSkKCmRmIDwtIGRmICU+JSAKICBmaWx0ZXIoY291bnRyeV9jb2RlICVpbiUgY291bnRyaWVzX2tlZXApCmBgYAoKYGBge3J9CmRmX2NsZWFuIDwtIGRmICU+JSAKICBncm91cF9ieSh0eXBlLGZvcl9ncm91cF9pZCxjb3VudHJ5X2NvZGUpICU+JSAKICBzdW1tYXJpc2UoTiA9IHN1bShOKSkgJT4lIAogIHVuZ3JvdXAoKSAlPiUgCiAgbXV0YXRlKGZvcl9ncm91cF9pZCA9IGZhY3Rvcihmb3JfZ3JvdXBfaWQpLAogICAgICAgICBjb3VudHJ5X2NvZGUgPSBmYWN0b3IoY291bnRyeV9jb2RlKSkgJT4lIAogICAgIGNvbXBsZXRlKHR5cGUsZm9yX2dyb3VwX2lkLGNvdW50cnlfY29kZSxmaWxsPWxpc3QoTj0wKSkKCiNmaWx0ZXIgZW1wdHkgdG9waWNzCgplbXB0eV90b3BpY3MgPC0gZGZfY2xlYW4gJT4lIAogIGdyb3VwX2J5KGZvcl9ncm91cF9pZCxjb3VudHJ5X2NvZGUpICU+JSAKICBzdW1tYXJpc2UoTj0gc3VtKE4pKSAlPiUgCiAgZmlsdGVyKE49PTApCgpkZl9jbGVhbjIgPC0gZGZfY2xlYW4gJT4lIAogIGFudGlfam9pbihlbXB0eV90b3BpY3MpCmBgYAoKYGBge3J9CmRmX2NsZWFuIDwtIGRmX2NsZWFuICU+JSAKICBncm91cF9ieSh0eXBlLGNvdW50cnlfY29kZSkgJT4lIAogIHJlZnJhbWUoZm9yX2dyb3VwX2lkLAogICAgICAgICAgTiwKICAgICAgICAgIHAgPSBOL3N1bShOKSkKCmRmX2NsZWFuMiA8LSBkZl9jbGVhbjIgJT4lIAogIGdyb3VwX2J5KHR5cGUsY291bnRyeV9jb2RlKSAlPiUgCiAgcmVmcmFtZShmb3JfZ3JvdXBfaWQsCiAgICAgICAgICBOLAogICAgICAgICAgcCA9IE4vc3VtKE4pKQpgYGAKCgpgYGB7cn0KY29zc2ltcyA8LSBmdW5jdGlvbihkZil7CiAgZGYgJT4lIAogIHNlbGVjdCgtY291bnRyeV9jb2RlLC1OKSAlPiUgCiAgcGl2b3Rfd2lkZXIobmFtZXNfZnJvbSA9IHR5cGUsdmFsdWVzX2Zyb20gPSBwKSAlPiUgCiAgc3VtbWFyaXNlKGNvc19JTSA9IGNvc2luZShJbnRlcm5hdGlvbmFsLE1vYmlsZSksCiAgICAgICAgICAgIGNvc19JTiA9IGNvc2luZShJbnRlcm5hdGlvbmFsLE5hdGlvbmFsKSwKICAgICAgICAgICAgY29zX01OID0gY29zaW5lKE1vYmlsZSxOYXRpb25hbCkpCn0KYGBgCgoKYGBge3J9CmNvc2luZV9zaW0gPC0gZGZfY2xlYW4gJT4lIAogIGdyb3VwX2J5KGNvdW50cnlfY29kZSkgJT4lIAogIGNvc3NpbXMoKQpgYGAKCiMjIHNpemUgdmVyc3VzIGNoYW5nZQoKYGBge3IsIGZpZy5oZWlnaHQ9MTIsIGZpZy53aWR0aD0xMn0KZ2dwbG90bHkoCmRmICU+JSBncm91cF9ieShjb3VudHJ5X2NvZGUpICU+JSAKICBzdW1tYXJpc2UoTj1zdW0oTikpICU+JSAKICByaWdodF9qb2luKGNvc2luZV9zaW0pICU+JSAKICBwaXZvdF9sb25nZXIoY29scyA9IGNvc19JTTpjb3NfTU4sbmFtZXNfdG8gPSAncmVsYXRpb24nLHZhbHVlc190byA9ICdjb3NpbmUnLG5hbWVzX3ByZWZpeCA9ICdjb3NfJykgJT4lIAogIG11dGF0ZShyZWxhdGlvbiA9IGNhc2Vfd2hlbihyZWxhdGlvbj09J0lNJyB+J0ludGVybmF0aW9uYWwtTW9iaWxlJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmVsYXRpb249PSdJTicgfidJbnRlcm5hdGlvbmFsLU5hdGlvbmFsJywKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmVsYXRpb249PSdNTicgfidNb2JpbGUtTmF0aW9uYWwnLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICApKSAlPiUgCiAgZmlsdGVyKGNvdW50cnlfY29kZSE9J1paQUxMJykgJT4lIAogIGdncGxvdChhZXMoTiwgY29zaW5lLCBjb2xvcj1yZWxhdGlvbiwgbGFiZWw9Y291bnRyeV9jb2RlKSkgKwogIGdlb21fcG9pbnQoKSsKICAKICBzY2FsZV94X2xvZzEwKCkKKQpgYGAKCiMjIFRyeSB0byBub3JtYWxpemUgc2l6ZQojIyBjb21wYXJlIGFsbCBwYXBlcnMgdnMgYWxsIHBhcGVycyBtaW51cyAoaW50ZXJuYXRpb25hbC9Nb2JpbGUvTmF0aW9uYWwpCgoKYGBge3J9CmNvc3NpbXNfZGlmZiA8LSBmdW5jdGlvbihkZil7CiAgZGYgJT4lCiAgc2VsZWN0KC1jb3VudHJ5X2NvZGUsLXApICU+JSAKICBwaXZvdF93aWRlcihpZF9jb2xzID0gYyhjb3VudHJ5X2NvZGUsZm9yX2dyb3VwX2lkKSxuYW1lc19mcm9tID0gdHlwZSx2YWx1ZXNfZnJvbSA9IE4pICU+JSAKICBtdXRhdGUoQWxsID0gSW50ZXJuYXRpb25hbCArIE1vYmlsZSArIE5hdGlvbmFsLAogICAgICAgICBJbnRlcm5hdGlvbmFsID0gQWxsIC0gSW50ZXJuYXRpb25hbCwKICAgICAgICAgTW9iaWxlID0gQWxsIC0gTW9iaWxlLAogICAgICAgICBOYXRpb25hbCA9IEFsbCAtIE5hdGlvbmFsKSAlPiUgCiAgbXV0YXRlKEludGVybmF0aW9uYWwgPSBJbnRlcm5hdGlvbmFsL3N1bShJbnRlcm5hdGlvbmFsKSwKICAgICAgICAgTW9iaWxlID0gTW9iaWxlL3N1bShNb2JpbGUpLAogICAgICAgICBOYXRpb25hbCA9IE5hdGlvbmFsL3N1bShOYXRpb25hbCkpICU+JSAKICBzdW1tYXJpc2UoSW50ZXJuYXRpb25hbCA9IGNvc2luZShJbnRlcm5hdGlvbmFsLEFsbCksCiAgICAgICAgICAgIE5hdGlvbmFsID0gY29zaW5lKE5hdGlvbmFsLEFsbCksCiAgICAgICAgICAgIE1vYmlsZSA9IGNvc2luZShNb2JpbGUsQWxsKSkKfQpgYGAKCmBgYHtyfQpjb3NpbmVfc2ltX2RpZmYgPC0gZGZfY2xlYW4gJT4lIAogIGdyb3VwX2J5KGNvdW50cnlfY29kZSkgJT4lIAogIGNvc3NpbXNfZGlmZigpCmBgYAoKYGBge3J9CmdncGxvdGx5KApkZiAlPiUgZ3JvdXBfYnkoY291bnRyeV9jb2RlKSAlPiUgCiAgc3VtbWFyaXNlKE49c3VtKE4pKSAlPiUgCiAgcmlnaHRfam9pbihjb3NpbmVfc2ltX2RpZmYpICU+JSAKICBwaXZvdF9sb25nZXIoY29scyA9IEludGVybmF0aW9uYWw6TW9iaWxlLG5hbWVzX3RvID0gJ3JlbGF0aW9uJyx2YWx1ZXNfdG8gPSAnY29zaW5lJykgJT4lIAogIGZpbHRlcihjb3VudHJ5X2NvZGUhPSdaWkFMTCcpICU+JSAKICBnZ3Bsb3QoYWVzKE4sIGNvc2luZSwgY29sb3I9cmVsYXRpb24sIGxhYmVsPWNvdW50cnlfY29kZSkpICsKICBnZW9tX3BvaW50KCkrCiAgCiAgc2NhbGVfeF9sb2cxMCgpCikKYGBgCgoKIyMgZ3JvdXAgc2l6ZXMKCmBgYHtyfQpnZ3Bsb3RseSgKZGYgJT4lIGdyb3VwX2J5KGNvdW50cnlfY29kZSwgdHlwZSkgJT4lIAogIHN1bW1hcmlzZShOPXN1bShOKSkgJT4lIAogIHJpZ2h0X2pvaW4oY29zaW5lX3NpbV9kaWZmICU+JSBwaXZvdF9sb25nZXIoSW50ZXJuYXRpb25hbDpNb2JpbGUsIG5hbWVzX3RvPSd0eXBlJywgdmFsdWVzX3RvPSdjb3NpbmUnKSkgJT4lIAogICMgcGl2b3RfbG9uZ2VyKGNvbHMgPSBJbnRlcm5hdGlvbmFsOk1vYmlsZSxuYW1lc190byA9ICdyZWxhdGlvbicsdmFsdWVzX3RvID0gJ2Nvc2luZScpICU+JSAKICBmaWx0ZXIoY291bnRyeV9jb2RlIT0nWlpBTEwnKSAlPiUgCiAgZ3JvdXBfYnkoY291bnRyeV9jb2RlKSAlPiUgCiAgbXV0YXRlKGNvdW50cnlfTiA9IHN1bShOKSkgJT4lIAogIGdncGxvdChhZXMoY291bnRyeV9OLCBjb3NpbmUsIGNvbG9yPXR5cGUsc2l6ZT1OLCBsYWJlbD1jb3VudHJ5X2NvZGUpKSArCiAgZ2VvbV9wb2ludCgpKwogICMgCiAgc2NhbGVfeF9sb2cxMCgpCikKCmBgYAoKZXhjbHVkaW5nIHRvcGljcyB0aGF0IGhhdmUgYWxsIDAncyAKYGBge3J9Cgpjb3NpbmVfc2ltX2RpZmYyIDwtIGRmX2NsZWFuMiAlPiUgCiAgZ3JvdXBfYnkoY291bnRyeV9jb2RlKSAlPiUgCiAgY29zc2ltc19kaWZmKCkKCmdncGxvdGx5KApkZiAlPiUgZ3JvdXBfYnkoY291bnRyeV9jb2RlLCB0eXBlKSAlPiUgCiAgc3VtbWFyaXNlKE49c3VtKE4pKSAlPiUgCiAgcmlnaHRfam9pbihjb3NpbmVfc2ltX2RpZmYyICU+JSBwaXZvdF9sb25nZXIoSW50ZXJuYXRpb25hbDpNb2JpbGUsIG5hbWVzX3RvPSd0eXBlJywgdmFsdWVzX3RvPSdjb3NpbmUnKSkgJT4lIAogICMgcGl2b3RfbG9uZ2VyKGNvbHMgPSBJbnRlcm5hdGlvbmFsOk1vYmlsZSxuYW1lc190byA9ICdyZWxhdGlvbicsdmFsdWVzX3RvID0gJ2Nvc2luZScpICU+JSAKICBmaWx0ZXIoY291bnRyeV9jb2RlIT0nWlpBTEwnKSAlPiUgCiAgZ3JvdXBfYnkoY291bnRyeV9jb2RlKSAlPiUgCiAgbXV0YXRlKGNvdW50cnlfTiA9IHN1bShOKSkgJT4lIAogIGdncGxvdChhZXMoY291bnRyeV9OLCBjb3NpbmUsIGNvbG9yPXR5cGUsc2l6ZT1OLCBsYWJlbD1jb3VudHJ5X2NvZGUpKSArCiAgZ2VvbV9wb2ludCgpKwogIAogIHNjYWxlX3hfbG9nMTAoKQopCgpgYGA=